# Pull the per-comparison inputs out of the report parameters.
cont <- params$controlCellLine
exp <- params$experimentalCellLine
countmatrix.all <- params$countmatrix.all
metadata.all <- params$metadata.all
# Drop `params` so that subsequent parameterized calls can be made.
rm(params)

# Keep only the samples that belong to the control or the experimental line.
metadata.pair <- filter(
  as.data.frame(metadata.all),
  CellLine %in% c(cont, exp)
)
as.data.frame(metadata.pair)

# Restrict the count matrix to the columns named by the retained samples'
# ShortName values.
countmatrix.pair <- countmatrix.all[, metadata.pair$ShortName]
as.data.frame(countmatrix.pair)
Run DESeq2 on the data set.
# Save time by loading the dds that was already run instead of refitting it
# (recent changes are all after this point). The loaded file is expected to
# provide `dds.pair`, which is used immediately below.
load(file = str_interp("Rdata/${exp}_vs_${cont}_dds.RData"))

# Experimental-vs-control contrast on CellLine, then sort by ascending
# log2 fold change.
res <- results(
  dds.pair,
  contrast = c("CellLine", exp, cont),
  alpha = 0.05
)
res <- res[order(res$log2FoldChange), ]

# Export the full, sorted results table.
outFile <- str_interp("output/${exp}_vs_${cont}_deseq_results.csv")
write.csv(
  as.data.frame(res),
  file = outFile
)
Filter `res` to the genes with an adjusted p-value (padj) below 0.05.
# Keep only the genes whose adjusted p-value is below 0.05.
res.filtered <- filter(as.data.frame(res), padj < 0.05)
# Optional effect-size filter, currently disabled:
# filter(log2FoldChange >= 1.5 | log2FoldChange <= -1.5)

# Largest log2 fold change first.
res.filtered <- res.filtered[
  order(res.filtered$log2FoldChange, decreasing = TRUE),
]
res.filtered
# Every gene with a positive log2 fold change, regardless of significance,
# ordered from the largest increase downwards.
up.unfiltered <- subset(res, log2FoldChange > 0)
up.unfiltered <- up.unfiltered[
  order(up.unfiltered$log2FoldChange, decreasing = TRUE),
]

# Export and display only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_all_upregulated_genes.csv")
write.csv(
  up.unfiltered[, c("log2FoldChange", "padj")],
  file = outFile
)
up.unfiltered[, c("log2FoldChange", "padj")]
log2 fold change (MLE): CellLine OVCAR4B vs OVCAR4
DataFrame with 10409 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
CACNA1A 10.66782 4.56164e-18
MAGED4B 9.67037 7.59414e-18
HEY2 9.18338 2.83462e-59
HNRNPUL2-BSCL2 9.07998 4.88830e-02
ERCC6-PGBD3 8.91472 7.62391e-02
... ... ...
LOC102723735 0.000570077 0.999509
MYBL2 0.000398202 0.999378
MRPL45 0.000376418 0.999351
ZNF710 0.000229081 0.999509
TFAP2A 0.000154840 0.999704
# Significantly upregulated genes: rows of the padj-filtered table with a
# positive log2 fold change, largest increase first.
up <- res.filtered[which(res.filtered$log2FoldChange > 0), , drop = FALSE]
up <- up[order(up$log2FoldChange, decreasing = TRUE), ]

# Export and print only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_genes.csv")
write.csv(
  up[, c("log2FoldChange", "padj")],
  file = outFile
)
print(up[, c("log2FoldChange", "padj")])
# Every gene with a negative log2 fold change, regardless of significance,
# ordered from the most negative value upwards.
down.unfiltered <- subset(res, log2FoldChange < 0)
down.unfiltered <- down.unfiltered[
  order(down.unfiltered$log2FoldChange, decreasing = FALSE),
]

# Export and print only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_all_downregulated_genes.csv")
write.csv(
  down.unfiltered[, c("log2FoldChange", "padj")],
  file = outFile
)
print(down.unfiltered[, c("log2FoldChange", "padj")])
log2 fold change (MLE): CellLine OVCAR4B vs OVCAR4
DataFrame with 10255 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
MYL9 -15.1040 8.29788e-33
EMX2 -13.8208 8.25117e-28
FBN2 -13.1692 1.04845e-48
NPAS3 -12.5376 2.05682e-22
EMX2OS -12.4689 6.47393e-43
... ... ...
LOC389895 -0.000594419 0.999351
LEPREL2 -0.000562425 0.999452
MUM1 -0.000538818 0.999208
SPIRE1 -0.000351570 0.999142
SMIM20 -0.000239130 0.999509
# Significantly downregulated genes: rows of the padj-filtered table with a
# negative log2 fold change.
down <- subset(res.filtered, log2FoldChange < 0)
# Sort ascending so the most strongly downregulated genes come first. This
# matches the ordering used for `down.unfiltered`; the previous
# `decreasing = TRUE` listed the weakest changes at the top of the table.
down <- down[order(down$log2FoldChange, decreasing = FALSE), ]

# Export and print only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_genes.csv")
write.csv(down[, c("log2FoldChange", "padj")], file = outFile)
print(down[, c("log2FoldChange", "padj")])
# Volcano plot of every gene: log2 fold change on x, -log10(padj) on y, each
# point labelled with its row name from `res`.
ggplot(
  as.data.frame(res),
  aes(x = log2FoldChange, y = -log10(padj), label = rownames(res))
) +
  geom_point() +
  geom_text_repel() +
  # Horizontal guide at 1.301, i.e. roughly -log10(0.05).
  geom_hline(yintercept = 1.301) +
  # Vertical guides at log2 fold changes of +/- 1.2.
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  # NOTE(review): no colour aesthetic is mapped above, so this palette does
  # not currently affect the plot.
  scale_color_manual(values = c("black", "blue", "red")) +
  # Points outside this x range are removed, not merely hidden.
  xlim(-10, 10) +
  theme_minimal()
Warning: Removed 13010 rows containing missing values (`geom_point()`).
Warning: Removed 13010 rows containing missing values (`geom_text_repel()`).
Warning: ggrepel: 16703 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Perform gene set enrichment analysis (GSEA) using clusterProfiler. This gives us the GO pathways that are significantly regulated, based on the log2 fold change in expression of individual genes.
Using a p-value cutoff of 0.05.
# Ranked input for GSEA: log2 fold changes named by the row names of `res`
# (passed to gseGO as SYMBOL keys), sorted from largest to smallest.
gene_list <- setNames(res$log2FoldChange, rownames(res))
gene_list <- sort(gene_list, decreasing = TRUE)

# Fix the RNG seed so the results are reproducible; `seed = TRUE` is passed
# to gseGO alongside it.
set.seed(2023)
gsea_res <- gseGO(
  gene_list,
  ont = "BP",
  OrgDb = "org.Hs.eg.db",
  keyType = "SYMBOL",
  pvalueCutoff = 0.05,
  seed = TRUE
)
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (14.08% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize = minSize, : For some
pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for
better estimation.
leading edge analysis...
done...
# Format output: flatten the GSEA result into a data frame and remember each
# pathway's position in the original result, since the rows are reordered next
# and that position is used later to select gene sets for plotting.
gsea_res_df <- as.data.frame(gsea_res)
gsea_res_df$original_row_num <- seq_len(nrow(gsea_res_df))

# Highest NES first; index the rows by pathway ID.
gsea_res_df <- gsea_res_df[order(gsea_res_df$NES, decreasing = TRUE), ]
rownames(gsea_res_df) <- gsea_res_df$ID
NES is the normalized enrichment score.
# Compact pathway table: significance, NES and description, plus the core
# enrichment genes under a more descriptive column name.
gsea_res_df_short <- gsea_res_df[, c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short$core_enrichment_genes <- gsea_res_df$core_enrichment

# Pathways with a non-negative NES are reported as upregulated.
gsea_res_df_short.up <- subset(gsea_res_df_short, NES >= 0)

outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_pathways.csv")
write.csv(
  gsea_res_df_short.up,
  file = outFile
)
gsea_res_df_short.up
GSEA plot of the five most upregulated pathways (or least downregulated)
# Plot at most five pathways. `gsea_res_df` is sorted by decreasing NES, so its
# first rows are the most upregulated pathways.
maxIndex <- min(5, nrow(gsea_res_df))
# seq_len() yields an empty index when no pathway passed the cutoff, whereas
# 1:maxIndex would yield c(1, 0) and select a non-existent row.
top5PathwaysIds <- gsea_res_df[seq_len(maxIndex), "original_row_num"]
# Only draw the plot when there is at least one pathway to show.
if (length(top5PathwaysIds) > 0) {
  gseaplot2(
    gsea_res,
    geneSetID = top5PathwaysIds,
    pvalue_table = FALSE,
    ES_geom = "dot"
  )
}
Volcano Plot (Average NES & adjusted p value)
# Volcano plot of the upregulated pathways: NES on x, -log10(adjusted p) on y,
# each point labelled with its row name.
ggplot(
  as.data.frame(gsea_res_df_short.up),
  aes(
    x = NES,
    y = -log10(p.adjust),
    label = rownames(gsea_res_df_short.up)
  )
) +
  geom_point() +
  geom_text_repel() +
  # Horizontal guide at 1.301, i.e. roughly -log10(0.05).
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  # NOTE(review): no colour aesthetic is mapped above, so this palette does
  # not currently affect the plot.
  scale_color_manual(values = c("black", "blue", "red")) +
  xlim(-10, 10) +
  theme_minimal()
Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
# Pathways with a non-positive NES are reported as downregulated.
gsea_res_df_short.down <- subset(gsea_res_df_short, NES <= 0)

outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_pathways.csv")
write.csv(
  gsea_res_df_short.down,
  file = outFile
)
gsea_res_df_short.down
GSEA plot of the five most downregulated pathways (or least upregulated)
# Plot at most five pathways. `gsea_res_df` is sorted by decreasing NES, so its
# last rows are the most downregulated pathways.
# The first of the last five rows is nrow - 4; the previous nrow - 5 selected
# six rows. max() clamps the start to row 1 when there are fewer than five.
minIndex <- max(1, nrow(gsea_res_df) - 4)
# length.out gives an empty index when there are no pathways at all, whereas
# minIndex:nrow(...) would yield c(1, 0) and select a non-existent row.
bottom5PathwaysIds <- gsea_res_df[
  seq(minIndex, length.out = min(5, nrow(gsea_res_df))),
  "original_row_num"
]
# Only draw the plot when there is at least one pathway to show.
if (length(bottom5PathwaysIds) > 0) {
  gseaplot2(
    gsea_res,
    geneSetID = bottom5PathwaysIds,
    pvalue_table = FALSE,
    ES_geom = "dot"
  )
}
Volcano plot (Average NES & adjusted p value)
# Volcano plot of the downregulated pathways: NES on x, -log10(adjusted p) on
# y, each point labelled with its row name.
ggplot(
  as.data.frame(gsea_res_df_short.down),
  aes(
    x = NES,
    y = -log10(p.adjust),
    label = rownames(gsea_res_df_short.down)
  )
) +
  geom_point() +
  geom_text_repel() +
  # Horizontal guide at 1.301, i.e. roughly -log10(0.05).
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  # NOTE(review): no colour aesthetic is mapped above, so this palette does
  # not currently affect the plot.
  scale_color_manual(values = c("black", "blue", "red")) +
  xlim(-10, 10) +
  theme_minimal()
Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Use Revigo to cluster upregulated pathways
# Clustering input for the upregulated pathways: a one-column table of
# adjusted p-values that carries the same row names as the pathway table.
revigo_input.cellline.up <- gsea_res_df_short.up[, "p.adjust", drop = FALSE]
row.names(revigo_input.cellline.up) <- row.names(gsea_res_df_short.up)

# Pairwise similarity between those terms (Biological Process ontology,
# "Rel" method).
simMatrix <- calculateSimMatrix(
  rownames(revigo_input.cellline.up),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
# Score each term by -log10 of its adjusted p-value, named by term.
scores <- setNames(
  -log10(revigo_input.cellline.up$p.adjust),
  rownames(revigo_input.cellline.up)
)

# Terms are only clustered when there is more than one of them; otherwise an
# empty data frame is stored so that the plotting chunks below are skipped.
if (nrow(revigo_input.cellline.up) <= 1) {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
} else {
  reducedTerms <- reduceSimMatrix(
    simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
}
Revigo interactive scatter plot. Distances represent the similarity between terms, and the axes are the first two components of a PCA. Each bubble is the representative (chosen mostly by p-value) of a cluster of terms. The size of a bubble indicates the generality of the term (larger means more general).
# Draw the interactive scatter plot only when at least three reduced terms
# are available.
if (nrow(reducedTerms) >= 3) {
  revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
# Draw the static heatmap only when at least three reduced terms are
# available; rows are annotated with their parent term.
if (nrow(reducedTerms) >= 3) {
  heatmapPlot(
    simMatrix,
    reducedTerms,
    fontsize = 6,
    annotationLabel = "parentTerm",
    annotateParent = TRUE
  )
}
This is the same content, but interactive.
# Draw the interactive heatmap only when at least three reduced terms are
# available.
if (nrow(reducedTerms) >= 3) {
  revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms are grouped and colored by their parent term. The area of each tile is proportional to the statistical significance of the GO term (-log10 of the adjusted p-value).
# Draw the treemap only when at least three reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  treemapPlot(reducedTerms)
}
Use Revigo to cluster downregulated pathways
# Clustering input for the downregulated pathways: a one-column table of
# adjusted p-values that carries the same row names as the pathway table.
revigo_input.cellline.down <- gsea_res_df_short.down[, "p.adjust", drop = FALSE]
row.names(revigo_input.cellline.down) <- row.names(gsea_res_df_short.down)

# Pairwise similarity between those terms (Biological Process ontology,
# "Rel" method).
simMatrix <- calculateSimMatrix(
  rownames(revigo_input.cellline.down),
  orgdb = "org.Hs.eg.db",
  ont = "BP",
  method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
# Score each term by -log10 of its adjusted p-value, named by term.
scores <- setNames(
  -log10(revigo_input.cellline.down$p.adjust),
  rownames(revigo_input.cellline.down)
)

# Terms are only clustered when there is more than one of them; otherwise an
# empty data frame is stored so that the plotting chunks below are skipped.
if (nrow(revigo_input.cellline.down) <= 1) {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly downregulated pathways to meaningfully cluster them")
} else {
  reducedTerms <- reduceSimMatrix(
    simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
}
Revigo interactive scatter plot. Distances represent the similarity between terms, and the axes are the first two components of a PCA. Each bubble is the representative (chosen mostly by p-value) of a cluster of terms. The size of a bubble indicates the generality of the term (larger means more general).
# Draw the interactive scatter plot only when at least three reduced terms
# are available.
if (nrow(reducedTerms) >= 3) {
  revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
# Draw the static heatmap only when at least three reduced terms are
# available; rows are annotated with their parent term.
if (nrow(reducedTerms) >= 3) {
  heatmapPlot(
    simMatrix,
    reducedTerms,
    fontsize = 6,
    annotationLabel = "parentTerm",
    annotateParent = TRUE
  )
}
This is the same content, but interactive.
# Draw the interactive heatmap only when at least three reduced terms are
# available.
if (nrow(reducedTerms) >= 3) {
  revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms are grouped and colored by their parent term. The area of each tile is proportional to the statistical significance of the GO term (-log10 of the adjusted p-value).
# Draw the treemap only when at least three reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  treemapPlot(reducedTerms)
}